Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase their hit ratio.
Univariate analysis (12 marks)
Multivariate analysis (8 marks)
# --- Environment setup: warnings, core libraries, plotting, and modelling tools ---
import warnings
warnings.filterwarnings('ignore')  # silence library deprecation noise in notebook output
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style(style='darkgrid')
import plotly.express as px
from sklearn.model_selection import train_test_split as tts
from sklearn import tree
from sklearn import metrics as mtr
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from yellowbrick.classifier import ROCAUC  # ROC/AUC visualizer used for every model below
pd.options.display.float_format = '{:,.4f}'.format  # show floats with 4 decimal places
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))  # widen notebook cells
# --- Load the bank marketing dataset and perform an initial clean-up ---
df = pd.read_csv('bank-full.csv')
df.head()
# Normalize the label column name to lowercase 'target'
df.rename(columns={'Target':'target'}, inplace=True)
df.shape
# Inspect calls with zero duration (no actual conversation took place)
df[df.duration == 0]
# Dropping observations where the duration is 'zero' so as to not influence the model
df.drop(df[df.duration == 0].index, axis=0, inplace=True)
# Keep an untouched copy for the multivariate plots (pairplot / correlation heatmap)
dfM = df.copy()
df.info()
df.isnull().sum()  # missing-value check per column
df[df==0].count()  # count of zero entries per column
df.describe(include='all').T  # summary statistics for all columns
# --- Univariate and multivariate exploratory plots ---
df.hist(bins=30, figsize=[30,20])  # distribution of every numeric feature
plt.figure(figsize=[30,20])
sns.set(font_scale=1.15)
# Pairwise scatter plots coloured by target, using the pre-binning copy of the data
sns.pairplot(dfM, hue='target')
plt.show()
# Correlation heatmap over the numeric features
corr = dfM.corr()
plt.figure(figsize=[20,15])
sns.set(font_scale=1.15)
sns.heatmap(corr,annot=True,vmin=-1,vmax=1,center=0, linewidth=0.2, fmt='.2f', cmap='coolwarm')
plt.show()
# Call-duration distribution split by target, with marginal box plot
fig = px.histogram(df, x='duration', marginal='box', color='target')
fig.show()
# Converting duration from continuous variable to categorical variable by grouping them into buckets
durationbins=[0,50,77,100,125,150,180,215,260,320,410,600,5000]
durationlabels=['1-50','50-77','77-100','100-125','125-150','150-180','180-215','215-260','260-320','320-410','410-600','600+']
df['durationCat'] = pd.cut(df.duration, bins=durationbins,labels=durationlabels)
df.durationCat.value_counts()
# 'previous' distribution (values >= 50 excluded from this plot only, not from the data)
fig = px.histogram(df[df.previous < 50], x='previous', marginal='box', color='target')
fig.show()
fig = px.violin(df, color='target', y='previous', width=800)
fig.show()
# Compare target split for never-contacted vs previously-contacted clients
df[df.previous == 0].target.value_counts()
df[df.previous > 0].target.value_counts()
# Converting previous from continuous variable to categorical variable by grouping them into buckets
previousbins=[-2,0,1,2,4,300]
previouslabels=['0','1','2','3-4','5+']
df['previousCat'] = pd.cut(df.previous, bins=previousbins,labels=previouslabels)
df.previousCat.value_counts().sort_values()
# pdays analysis; -1 presumably encodes "not contacted in a prior campaign" -- verify with data dictionary
fig = px.histogram(df, x='pdays', marginal='box', color='target')
fig.show()
df[df.pdays == -1].target.value_counts()
df[df.pdays > -1].target.value_counts()
# Relationship between pdays and previous (OLS trendline); motivates dropping pdays later
fig = px.scatter(df, x='previous', y='pdays', trendline="ols", width=600)
fig.show()
df.drop('pdays',axis=1, inplace=True)  # drop pdays; 'previous' carries similar information
fig = px.histogram(df, x='campaign', marginal='box', color='target')
fig.show()
# Converting campaign from continuous variable to categorical variable by grouping them into buckets
campaignbins=[-1,1,2,3,5,70]
campaignlabels=['1','2','3','4-5','5+']
df['campaignCat'] = pd.cut(df.campaign, bins=campaignbins,labels=campaignlabels)
df.campaignCat.value_counts().sort_values()
fig = px.histogram(df, x='balance', marginal='box', color='target')
fig.show()
# Converting balance from continuous variable to categorical variable by grouping them into buckets
balancebins=[-9000,-50,0,60,150,250,400,600,900,1400,2200,4000,103000]
balancelabels=['<(50)','(50)-0','0-60','60-150','150-250','250-400','400-600','600-900','900-1.4K','1.4K-2.2K','2.2K-4K','4K+']
df['balanceCat'] = pd.cut(df.balance, bins=balancebins,labels=balancelabels)
df.balanceCat.value_counts().sort_values()
# age and day distributions by target (both kept as continuous features)
fig = px.histogram(df, x='age', marginal='box', color='target')
fig.show()
fig = px.histogram(df, x='day', marginal='box', color='target')
fig.show()
df.info()
# With the addition of categorical variables, the original features are removed
df.drop(['duration','previous','campaign','balance'],axis=1, inplace=True)
# Categorical variables are converted from object datatype to category datatype.
# FIX: the loop body was flattened to column 0 in the notebook export, which is a
# syntax error in plain Python; re-indented under the for statement.
for col in ['job','marital','education','default','housing','loan','contact','month','poutcome']:
    df[col] = df[col].astype('category')
df.info()
# Splitting the dependent variable from features
dfy = df.target.copy()
dfX = df.drop('target', axis=1)
# 1-hot-encoding the features / independent variables
dfX = pd.get_dummies(dfX)
dfX.info()
# Convert dependent variable (outcome) to binary (0 & 1)
ymap = {'yes':1, 'no':0}
dfy = dfy.replace(ymap)
dfy.value_counts()
# Splitting the data for training; stratify keeps the class ratio equal in both splits
dfX_tr, dfX_ts, dfy_tr, dfy_ts = tts(dfX, dfy, test_size=0.3, random_state=6, stratify=dfy)
dfy_tr.value_counts(normalize=True)*100
dfy_ts.value_counts(normalize=True)*100
# --- Model 1: Logistic Regression baseline ---
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(solver='newton-cg')
logit.fit(dfX_tr,dfy_tr)
print('Training score : ',logit.score(dfX_tr, dfy_tr))
# BUG FIX: the original called logit.fit(dfX_ts, dfy_ts) here, refitting the model
# on the TEST set before scoring it (data leakage). The test score must come from
# the model trained only on the training split.
print('Test score : ',logit.score(dfX_ts, dfy_ts))
y_pred1 = logit.predict(dfX_ts)
# Confusion matrix with the positive class (1 = subscribed) listed first
cmat1 = mtr.confusion_matrix(dfy_ts,y_pred1, labels=[1,0])
dfcm1 = pd.DataFrame(cmat1, index=['1','0'],columns=['1','0'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm1, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
plt.ylabel('Predicted\n')
plt.show()
# ROC curve / AUC via yellowbrick
roc1 = ROCAUC(logit)
roc1.fit(dfX_tr, dfy_tr)
roc1.score(dfX_ts, dfy_ts)
# print classification report
print(mtr.classification_report(dfy_ts,y_pred1))
# Collect macro-averaged metrics for the model-comparison table
rocauc1 = roc1.score(dfX_ts, dfy_ts)
prec1 = mtr.precision_score(dfy_ts, y_pred1, average='macro')
recl1 = mtr.recall_score(dfy_ts, y_pred1, average='macro')
f1scr1 = mtr.f1_score(dfy_ts, y_pred1, average='macro')
accr1 = mtr.accuracy_score(dfy_ts, y_pred1)
print('ROC-AUC (by scoring) : ',rocauc1)
print('Precision Macro : ',prec1)
print('Recall Macro : ',recl1)
print('F1 Macro : ',f1scr1)
print('Accuracy : ',accr1)
# Comparison table: one column per model, one row per metric
compare = pd.DataFrame('',index=['Precision Macro','Recall Macro','F1 Macro','Accuracy','ROC-AUC'],columns=['Logit','DecTree','Bagging','AdaBoost','GradBoost','RandomForest'])
compare.Logit = np.array([prec1,recl1,f1scr1,accr1,rocauc1])
# --- Model 2: Decision Tree, tuned via GridSearchCV over max_depth ---
from sklearn.tree import DecisionTreeClassifier as dtc
param_grid = {
'max_depth':[10,15,20,25,50]
}
# recall_macro scoring weights the minority 'yes' class equally with 'no'
grid = GridSearchCV(dtc(criterion='gini',splitter='best'),param_grid,refit=True, verbose=True, n_jobs=-1, scoring='recall_macro')
gfit = grid.fit(dfX_tr,dfy_tr)
# print best parameter after tuning
print('Best Params : ',grid.best_params_)
# print classification report
grid_predictions = grid.predict(dfX_ts)
print(mtr.classification_report(dfy_ts, grid_predictions))
print("Best score: %0.3f" % gfit.best_score_)
# Final tree with max_depth=20 -- presumably the grid-search winner; verify against Best Params
dtree = dtc(criterion='gini',splitter='best',max_depth=20 )
dtree.fit(dfX_tr, dfy_tr)
print('Training Score : ',dtree.score(dfX_tr, dfy_tr))
print('Test Score : ',dtree.score(dfX_ts, dfy_ts))
y_pred2 = dtree.predict(dfX_ts)
# Confusion matrix with the positive class (1) listed first
cmat2 = mtr.confusion_matrix(dfy_ts,y_pred2, labels=[1,0])
dfcm2 = pd.DataFrame(cmat2, index=['1','0'],columns=['1','0'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm2, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
plt.ylabel('Predicted\n')
plt.show()
roc2 = ROCAUC(dtree)
roc2.fit(dfX_tr, dfy_tr)
roc2.score(dfX_ts, dfy_ts)
# print classification report
print(mtr.classification_report(dfy_ts,y_pred2))
# Collect macro-averaged metrics for the comparison table
rocauc2 = roc2.score(dfX_ts, dfy_ts)
prec2 = mtr.precision_score(dfy_ts, y_pred2, average='macro')
recl2 = mtr.recall_score(dfy_ts, y_pred2, average='macro')
f1scr2 = mtr.f1_score(dfy_ts, y_pred2, average='macro')
accr2 = mtr.accuracy_score(dfy_ts, y_pred2)
print('ROC-AUC (by scoring) : ',rocauc2)
print('Precision Macro : ',prec2)
print('Recall Macro : ',recl2)
print('F1 Macro : ',f1scr2)
print('Accuracy : ',accr2)
compare.DecTree = np.array([prec2,recl2,f1scr2,accr2,rocauc2])
# compare
# --- Model 3: Bagging ensemble, tuned over n_estimators and max_features ---
from sklearn.ensemble import BaggingClassifier
param_grid = {
'n_estimators':[9,11,13,15],
'max_features':[60,70,80,90]
}
# max_samples=25000 caps each bootstrap sample to keep fitting fast
grid = GridSearchCV(BaggingClassifier(max_samples=25000),param_grid,refit=True, verbose=True, n_jobs=-1, scoring='recall_macro')
gfit = grid.fit(dfX_tr,dfy_tr)
# print best parameter after tuning
print('Best Params : ',grid.best_params_)
# print classification report
grid_predictions = grid.predict(dfX_ts)
print(mtr.classification_report(dfy_ts, grid_predictions))
print("Best score: %0.3f" % gfit.best_score_)
# Final bagging model -- presumably the grid-search winner; verify against Best Params
bgcl = BaggingClassifier(max_features=80, max_samples=25000, n_estimators=13)
bgcl = bgcl.fit(dfX_tr, dfy_tr)
print('Training Score : ',bgcl.score(dfX_tr, dfy_tr))
print('Test Score : ',bgcl.score(dfX_ts, dfy_ts))
y_pred3 = bgcl.predict(dfX_ts)
# Confusion matrix with the positive class (1) listed first
cmat3 = mtr.confusion_matrix(dfy_ts,y_pred3, labels=[1,0])
dfcm3 = pd.DataFrame(cmat3, index=['1','0'],columns=['1','0'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm3, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
plt.ylabel('Predicted\n')
plt.show()
roc3 = ROCAUC(bgcl)
roc3.fit(dfX_tr, dfy_tr)
roc3.score(dfX_ts, dfy_ts)
# print classification report
print(mtr.classification_report(dfy_ts,y_pred3))
# Collect macro-averaged metrics for the comparison table
rocauc3 = roc3.score(dfX_ts, dfy_ts)
prec3 = mtr.precision_score(dfy_ts, y_pred3, average='macro')
recl3 = mtr.recall_score(dfy_ts, y_pred3, average='macro')
f1scr3 = mtr.f1_score(dfy_ts, y_pred3, average='macro')
accr3 = mtr.accuracy_score(dfy_ts, y_pred3)
print('ROC-AUC (by scoring) : ',rocauc3)
print('Precision Macro : ',prec3)
print('Recall Macro : ',recl3)
print('F1 Macro : ',f1scr3)
print('Accuracy : ',accr3)
compare.Bagging = np.array([prec3,recl3,f1scr3,accr3,rocauc3])
# compare
# --- Model 4: AdaBoost over a depth-4 decision-tree base estimator ---
from sklearn.ensemble import AdaBoostClassifier
# FIX: removed unused dtr3 (max_depth=3) and dtr5 (max_depth=5) trees; only the
# depth-4 estimator was ever referenced below.
dtr4 = dtc(criterion='gini',splitter='best',max_depth=4)
param_grid = {
'learning_rate':[0.9,1.0,1.1,1.2,1.5],
'n_estimators':[40,50,75]
}
grid = GridSearchCV(AdaBoostClassifier(base_estimator=dtr4,algorithm='SAMME.R'),param_grid,refit=True, verbose=True, n_jobs=-1, scoring='recall_macro')
gfit = grid.fit(dfX_tr,dfy_tr)
# print best parameter after tuning
print('Best Params : ',grid.best_params_)
# print classification report
grid_predictions = grid.predict(dfX_ts)
print(mtr.classification_report(dfy_ts, grid_predictions))
print("Best score: %0.3f" % gfit.best_score_)
# Final AdaBoost model -- presumably the grid-search winner; verify against Best Params
abcl = AdaBoostClassifier(base_estimator=dtr4,algorithm='SAMME.R',learning_rate=1.0,n_estimators=50)
abcl = abcl.fit(dfX_tr, dfy_tr)
print('Training Score : ',abcl.score(dfX_tr, dfy_tr))
print('Test Score : ',abcl.score(dfX_ts, dfy_ts))
y_pred4 = abcl.predict(dfX_ts)
# Confusion matrix with the positive class (1) listed first
cmat4 = mtr.confusion_matrix(dfy_ts,y_pred4, labels=[1,0])
dfcm4 = pd.DataFrame(cmat4, index=['1','0'],columns=['1','0'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm4, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
plt.ylabel('Predicted\n')
plt.show()
roc4 = ROCAUC(abcl)
roc4.fit(dfX_tr, dfy_tr)
roc4.score(dfX_ts, dfy_ts)
# print classification report
print(mtr.classification_report(dfy_ts,y_pred4))
# Collect macro-averaged metrics for the comparison table
rocauc4 = roc4.score(dfX_ts, dfy_ts)
prec4 = mtr.precision_score(dfy_ts, y_pred4, average='macro')
recl4 = mtr.recall_score(dfy_ts, y_pred4, average='macro')
f1scr4 = mtr.f1_score(dfy_ts, y_pred4, average='macro')
accr4 = mtr.accuracy_score(dfy_ts, y_pred4)
print('ROC-AUC (by scoring) : ',rocauc4)
print('Precision Macro : ',prec4)
print('Recall Macro : ',recl4)
print('F1 Macro : ',f1scr4)
print('Accuracy : ',accr4)
compare.AdaBoost = np.array([prec4,recl4,f1scr4,accr4,rocauc4])
# compare
# --- Model 5: Gradient Boosting with a broad hyper-parameter grid ---
from sklearn.ensemble import GradientBoostingClassifier
param_grid = {
# 'loss':['deviance', 'exponential'],
'learning_rate':[0.05,0.1,0.5],
'n_estimators':[40,50,60],
'subsample':[0.1,0.5,1.0],
# 'criterion' : ['friedman_mse', 'mse', 'mae'],
'max_depth':[10,20,50],
'min_impurity_decrease':[0.001,0.1,0.2]
# 'max_features':['auto', 'sqrt', 'log2']
}
grid = GridSearchCV(GradientBoostingClassifier(criterion='friedman_mse',loss='deviance',max_features='auto'),param_grid,refit=True, verbose=True, n_jobs=-1, scoring='recall_macro')
gfit = grid.fit(dfX_tr,dfy_tr)
# print best parameter after tuning
print('Best Params : ',grid.best_params_)
# print classification report
grid_predictions = grid.predict(dfX_ts)
print(mtr.classification_report(dfy_ts, grid_predictions))
print("Best score: %0.3f" % gfit.best_score_)
# NOTE(review): these final parameters (loss='exponential', max_depth=5, subsample=0.2)
# differ from the grid searched above -- presumably hand-tuned afterwards; verify.
gbcl = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.1, loss='exponential', max_depth=5, max_features='auto', min_impurity_decrease=0.2, n_estimators=60, subsample=0.2)
gbcl = gbcl.fit(dfX_tr, dfy_tr)
print('Training Score : ',gbcl.score(dfX_tr, dfy_tr))
print('Test Score : ',gbcl.score(dfX_ts, dfy_ts))
y_pred5 = gbcl.predict(dfX_ts)
# Confusion matrix with the positive class listed first
cmat5 = mtr.confusion_matrix(dfy_ts,y_pred5, labels=[1,0])
dfcm5 = pd.DataFrame(cmat5, index=['Yes','No'],columns=['Yes','No'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm5, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
plt.ylabel('Predicted\n')
plt.show()
roc5 = ROCAUC(gbcl)
roc5.fit(dfX_tr, dfy_tr)
roc5.score(dfX_ts, dfy_ts)
# print classification report
print(mtr.classification_report(dfy_ts,y_pred5))
# Collect macro-averaged metrics for the comparison table
rocauc5 = roc5.score(dfX_ts, dfy_ts)
prec5 = mtr.precision_score(dfy_ts, y_pred5, average='macro')
recl5 = mtr.recall_score(dfy_ts, y_pred5, average='macro')
f1scr5 = mtr.f1_score(dfy_ts, y_pred5, average='macro')
accr5 = mtr.accuracy_score(dfy_ts, y_pred5)
print('ROC-AUC (by scoring) : ',rocauc5)
print('Precision Macro : ',prec5)
print('Recall Macro : ',recl5)
print('F1 Macro : ',f1scr5)
# BUG FIX: the original printed accr4 (the AdaBoost accuracy) here instead of accr5.
print('Accuracy : ',accr5)
compare.GradBoost = np.array([prec5,recl5,f1scr5,accr5,rocauc5])
# compare
# --- Model 6: Random Forest, class-weight balanced for the imbalanced target ---
from sklearn.ensemble import RandomForestClassifier
param_grid = {
'n_estimators':[60,70,80,90,100,120,150],
'max_depth':[10,15,20,30,40,50]
}
grid = GridSearchCV(RandomForestClassifier(criterion='gini', class_weight='balanced', max_features='auto', min_impurity_decrease=0.001),param_grid,refit=True, verbose=True, n_jobs=-1, scoring='recall_macro')
gfit = grid.fit(dfX_tr,dfy_tr)
# print best parameter after tuning
print('Best Params : ',grid.best_params_)
# print classification report
grid_predictions = grid.predict(dfX_ts)
print(mtr.classification_report(dfy_ts, grid_predictions))
print("Best score: %0.3f" % gfit.best_score_)
# Final forest with max_depth=20, n_estimators=80 -- presumably the grid winner; verify
rfcl = RandomForestClassifier(criterion='gini', class_weight='balanced', max_features='auto', min_impurity_decrease=0.001, max_depth=20, n_estimators=80)
rfcl = rfcl.fit(dfX_tr, dfy_tr)
print('Training Score : ',rfcl.score(dfX_tr, dfy_tr))
print('Test Score : ',rfcl.score(dfX_ts, dfy_ts))
y_pred6 = rfcl.predict(dfX_ts)
# Confusion matrix with the positive class listed first
cmat6 = mtr.confusion_matrix(dfy_ts,y_pred6, labels=[1,0])
dfcm6 = pd.DataFrame(cmat6, index=['Yes','No'],columns=['Yes','No'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm6, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
plt.ylabel('Predicted\n')
plt.show()
roc6 = ROCAUC(rfcl)
roc6.fit(dfX_tr, dfy_tr)
roc6.score(dfX_ts, dfy_ts)
# print classification report
print(mtr.classification_report(dfy_ts,y_pred6))
# Collect macro-averaged metrics for the comparison table
rocauc6 = roc6.score(dfX_ts, dfy_ts)
prec6 = mtr.precision_score(dfy_ts, y_pred6, average='macro')
recl6 = mtr.recall_score(dfy_ts, y_pred6, average='macro')
f1scr6 = mtr.f1_score(dfy_ts, y_pred6, average='macro')
accr6 = mtr.accuracy_score(dfy_ts, y_pred6)
print('ROC-AUC (by scoring) : ',rocauc6)
print('Precision Macro : ',prec6)
print('Recall Macro : ',recl6)
print('F1 Macro : ',f1scr6)
print('Accuracy : ',accr6)
compare.RandomForest = np.array([prec6,recl6,f1scr6,accr6,rocauc6])
# Display the full comparison table
compare
# --- Final model comparison chart: one line per metric across all six models ---
# FIX: the original assigned cmpr = compare.T and then immediately overwrote it with
# compare.reset_index(), so the transpose was dead code; it has been removed.
cmpr = compare.reset_index()
# Melt to long format: one row per (metric, model) pair for plotly
cmpr = cmpr.melt(id_vars='index',value_vars=['Logit','DecTree','Bagging','AdaBoost','GradBoost','RandomForest'], var_name='Model',value_name='Value')
# cmpr
fig = px.line(cmpr, x='Model', y='Value', color='index', range_y=[0.6,1.0], width=900, height=600)
# FIX: loop over all traces instead of hard-coding indices 0..4
for trace in fig.data:
    trace.update(mode='markers+lines')
fig.show()